This script reproduces the analyses presented in Kim and Schneider et
al. 2026 Cell paper. The code starts from the read count matrices and
associated sample and gene info files available from GEO (https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE308213),
stored in the data/ folder in this repository.
library(tidyverse)
## Plotting
library(ggplot2)
library(ggrepel)
library(ggforce)
library(ggpubr)
library(RColorBrewer)
myPalette <- c('#e6194b', '#4363d8', '#3cb44b', '#984EA3', '#f58231', '#ffe119', '#F781BF', '#808080', '#98BFDB', '#bcf60c', '#008080', '#e6beff', '#E5C494', '#000075', '#CD00CD', '#aaffc3', '#808000', '#9a6324', '#fffac8', '#800000', '#000000', '#ffffff')
## Gene annotation
library(ensembldb)
library(AnnotationHub)
## Needed?
#library(org.Mm.eg.db)
#library(RSQLite)
## Single-cell analysis
library(SingleCellExperiment)
library(scater)
library(scran)
library(batchelor)
library(bluster)
## DE analysis
library(edgeR)
library(limma)
## Trimming funciton
trimMat <- function(x, trim=0.001) {
trim <- min(trim[1], 1 - trim[1])
lo <- quantile(x, trim, na.rm = TRUE)
hi <- quantile(x, 1 - trim, na.rm = TRUE)
x[x < lo] = lo
x[x > hi] = hi
x
}
## Save session info
writeLines(capture.output(devtools::session_info()), "sessionInfo.txt")
## Matrix of UMI counts for the scRNA-seq data
tab <- read.table(file = "data/GSE286541_UMIsMatrix.txt.gz", sep = "\t", h=T)
## Information on the cells
metadata <- read.table(file = "data/GSE286541_CellMetadata.txt.gz", sep = "\t", h=T, quote = "", check.names = F)
row.names(metadata) <- make.names(row.names(metadata))
sce <- SingleCellExperiment(list(counts=as(tab, "sparseMatrix")),
colData=DataFrame(metadata, check.names = FALSE))
## Information on the genes
rowData(sce)$GENEID <- row.names(sce)
ah <- AnnotationHub()
## snapshotDate(): 2024-04-30
edb <- ah[["AH113713"]] # Ensembl 110
## loading from cache
rowData(sce)$SYMBOL <- mapIds(edb, key=row.names(sce), keytype="GENEID", column="SYMBOL")
## Warning: Unable to map 6 of 33805 requested IDs.
rowData(sce)$DESCRIPTION <- mapIds(edb, key=row.names(sce), keytype="GENEID", column="DESCRIPTION")
## Warning: Unable to map 6 of 33805 requested IDs.
rowData(sce)$GENEBIOTYPE <- mapIds(edb, key=row.names(sce), keytype="GENEID", column="GENEBIOTYPE")
## Warning: Unable to map 6 of 33805 requested IDs.
row.names(sce) <- uniquifyFeatureNames(rowData(sce)$GENEID, rowData(sce)$SYMBOL)
## Fill-in information manually for the constructs
rowData(sce)[c("Ighv-MOG", "Neomycin", "Tdtomato", "Cre", "Ighv-FluBI", "Igkv-FluBI"),]$SYMBOL <- c("Ighv-MOG", "Neomycin", "Tdtomato", "Cre", "Ighv-FluBI", "Igkv-FluBI")
rowData(sce)[c("Ighv-MOG", "Neomycin", "Tdtomato", "Cre", "Ighv-FluBI", "Igkv-FluBI"),]$GENEBIOTYPE <- c("construct")
## Clean-up some of the cell metadata
sce$Cluster <- factor(sce$Cluster)
sce$AnnotatedCluster <- sce$Cluster
levels(sce$AnnotatedCluster) <- c("[1] GC-like 1", "[1] GC-like 1", "[2] transitional", "[3] naïve", "[1] GC-like 1", "[4] follicular 1", "[5] myeloid-like", "[6] plasma", "[7] follicular 2", "[8] pre-B", "[9] GC-like 2")
sce$Strain <- factor(sce$Strain, levels=c( "WT", "MOG", "FluBI" ))
sce$`Transgenic cell` <- factor(sce$`Transgenic cell`, levels=c("Not_transgenic", "MOG", "FluBI"))
sce$Day <- factor(sce$Day, levels=c("day7", "day21"))
sce$Sample <- factor(sce$Sample, level=c("FluBI_day7.rep1", "FluBI_day7.rep2", "FluBI_day7.rep3",
"FluBI+flu_day7.rep1", "FluBI+flu_day7.rep2", "FluBI+flu_day7.rep3",
"FluBI_day21.rep1", "FluBI_day21.rep2", "FluBI_day21.rep3",
"FluBI+flu_day21.rep1", "FluBI+flu_day21.rep2", "FluBI+flu_day21.rep3",
"MOG_day7.rep1", "MOG_day7.rep2", "MOG_day7.rep3",
"MOG_day21.rep1", "MOG_day21.rep2", "MOG_day21.rep3",
"MOG_Brain.rep1", "MOG_Brain.rep2", "MOG_Brain.rep3", "MOG_Brain.rep4",
"MOG_LN.rep1", "MOG_LN.rep2",
"WT+CD40L_Brain.rep1", "WT+CD40L_Brain.rep2", "WT+CD40L_Brain.rep3", "WT+CD40L_Brain.rep4",
"WT+CD40L_LN.rep1", "WT+CD40L_LN.rep2", "WT+CD40L_LN.rep3", "WT+CD40L_LN.rep4",
"MOG+CD40L_Brain.rep1", "MOG+CD40L_Brain.rep2", "MOG+CD40L_Brain.rep3", "MOG+CD40L_Brain.rep4",
"MOG+CD40L_LN.rep1", "MOG+CD40L_LN.rep2", "MOG+CD40L_LN.rep3", "MOG+CD40L_LN.rep4"))
## Normalization
sizeFactors(sce) <- sce$`Normalization size factors`
sce <- logNormCounts(sce, size.factors = sizeFactors(sce), center.size.factors = FALSE, name="logcounts")
## QC metrics
is.mito <- grepl("^mt-", rowData(sce)$SYMBOL)
is.ribo <- grepl("^Rp(l|s)", rowData(sce)$SYMBOL)
stats <- perCellQCMetrics(sce, subsets=list(MT=is.mito,
RIBO=is.ribo),
flatten=FALSE)
sce$scater_qc <- stats
## Modelling the technical noise in gene expression, fitting a trend assuming that technical noise is poisson distributed
set.seed(100)
dec.pois <- modelGeneVarByPoisson(sce)
rowData(sce)$decomposeVar <- dec.pois
## Extract HVGs
hvg.out <- getTopHVGs(rowData(sce)$decomposeVar, n=700)
## Remove MT, Ribo genes, VDJ genes and constructs
hvg.out <- hvg.out[!hvg.out %in% row.names(sce)[is.mito |
is.ribo |
rowData(sce)$GENEBIOTYPE %in% c("construct", "IG_C_gene", "IG_C_pseudogene",
"IG_D_gene", "IG_J_gene", "IG_LV_gene",
"IG_V_gene", "IG_V_pseudogene", "TR_C_gene",
"TR_D_gene", "TR_J_gene", "TR_J_pseudogene",
"TR_V_gene", "TR_V_pseudogene")]]
## PCA and denoising steps using the Poisson trend
sce <- runPCA(sce, subset_row=hvg.out)
set.seed(1000)
sce <- denoisePCA(sce,
technical=rowData(sce)$decomposeVar,
subset.row=hvg.out)
ncol(reducedDim(sce, "PCA")) ## 34 PCs retained
## [1] 34
## Integration across samples
set.seed(1234)
mnn.out <- fastMNN(sce,
batch=sce$Sample,
k=15,
subset.row=hvg.out,
BSPARAM=BiocSingular::RandomParam(deferred=TRUE))
set.seed(1234)
mnn.out <- runTSNE(mnn.out,
perplexity = 50,
dimred="corrected")
## Add to SCE object
reducedDim(sce, "MNN") <- reducedDim(mnn.out, "corrected")
reducedDim(sce, "TSNE_MNN") <- reducedDim(mnn.out, "TSNE")
## NOTE: due to package version differences, tSNE coordinates might differ from what is used in the article. The exact tSNE coordinates used in the article are available is the colData(sce) slot
## Clustering
set.seed(1234)
clust.k5 <- clusterCells(sce,
use.dimred="MNN",
BLUSPARAM=NNGraphParam(k=5, cluster.fun=c("louvain")))
table(clust.k5)
## clust.k5
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 2576 1941 1191 780 3649 4077 1051 928 2871 1499 734 141 130
## NOTE: due to package version differences, clustering might differ from what is used in the article (accessible at colData(sce)$Cluster)
table(colData(sce)$Cluster)
##
## 1 2 3 4 5 6 7 8 9 10 11
## 2375 1921 1334 511 3421 4081 3261 870 2886 664 244
## Prepare a metadata table to be used below for some plotting
pheno <- colData(sce)[, c("Sample", "Condition", "Condition_full", "Day", "Strain", "Tissue")] |> as_tibble() |> dplyr::distinct() |> as.data.frame()
row.names(pheno) <- pheno$Sample
pheno$Condition_full <- factor(pheno$Condition_full, levels=c("MOG_day7", "FluBI_day7", "FluBI+flu_day7", "MOG_day21", "FluBI_day21", "FluBI+flu_day21", "MOG_Brain", "MOG+CD40L_Brain", "WT+CD40L_Brain", "MOG_LN", "MOG+CD40L_LN", "WT+CD40L_LN"))
pheno$Condition <- factor(pheno$Condition)
## Reorder levels for days
pheno$Day <- factor(pheno$Day, levels=c("day7", "day21", "day28"))
myTheme1 <- theme_bw(base_size = 20) +
theme(panel.grid.major = element_line(colour = "lightgrey", linewidth = 0.2),
panel.grid.minor = element_line(colour = "lightgrey", linewidth = 0.05),
legend.position="right", legend.direction = "vertical",
legend.text=element_text(size=16),
axis.title=element_text(size=20),
axis.text=element_text(size=10),
panel.spacing.x = unit(1.2, "lines"),
axis.text.x=element_blank(), ## Axes labels do not make much sense for tSNE plots
axis.ticks.x=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
## Figure 3B: tSNE showing clusters, for experiment 1 only
myDf <- data.frame(colData(sce), check.names=F)
subset <- myDf$Condition_full %in% c("MOG_day21", "FluBI_day21", "MOG_day7", "FluBI_day7")
f3b <- ggplot(myDf[subset, ], aes(x=`tSNE1 (MNN corrected)`, y=`tSNE2 (MNN corrected)`, colour=AnnotatedCluster)) +
geom_point(size=1, alpha=0.5) +
scale_color_manual(name="", values=myPalette[c(2,3,4,6,7,8,9,10,11)]) +
guides(colour = guide_legend(override.aes = list(size=3, alpha=1), ncol=1)) +
myTheme1
f3b
## Figure 3C: Transgenic cells or not, split by strain and days
ann_ellipse <- data.frame(x0 = 12, y0 = -1, a = 18, b = 11, angle = 0, Strain = factor(c("MOG"), levels=levels(sce$Strain)))
ann_ellipse2 <- data.frame(x0 = 12, y0 = 22, a = 7, b = 10, angle = 0, Strain = factor(c("MOG"), levels=levels(sce$Strain)))
levels(myDf$`Transgenic cell`) <- c("non-transgenic", "transgenic", "transgenic")
f3c <- ggplot(myDf[subset, ], aes(x=`tSNE1 (MNN corrected)`, y=`tSNE2 (MNN corrected)`,
colour=`Transgenic cell`, size=`Transgenic cell`, alpha=`Transgenic cell`)) +
geom_point() +
geom_ellipse(data=ann_ellipse, inherit.aes = FALSE,
aes(x0 = x0, y0 = y0, a = a, b = b, angle = angle),
colour="maroon1", linewidth = 1.5, show.legend=FALSE) +
geom_ellipse(data=ann_ellipse2, inherit.aes = FALSE,
aes(x0 = x0, y0 = y0, a = a, b = b, angle = angle),
colour="goldenrod1", linewidth = 1.5, show.legend=FALSE) +
scale_color_manual(name="", values=myPalette[c(2,1,3)]) +
scale_size_manual(name="", values=c(0.6,1,0)) +
scale_alpha_manual(name="", values=c(0.5, 0.8,0)) +
facet_grid(vars(Day), vars(Strain)) +
guides(colour = guide_legend(override.aes = list(size=4, alpha=1), nrow=1)) +
myTheme1 +
theme(strip.background = element_blank(),
legend.position="bottom")
f3c